!
! Copyright (C) 2001-2016 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!----------------------------------------------------------------------
subroutine dvqpsi_us (ik, uact, addnlcc, becp1, alphap)
  !----------------------------------------------------------------------
  !! This routine calculates \(dV_\text{bare}/d\tau \cdot \psi\) for one 
  !! perturbation with a given q. The displacements are described by a 
  !! vector u.  
  !! The result is stored in \(\text{dvpsi}\). The routine is called for
  !! each k-point and for each pattern u. It computes simultaneously all 
  !! the bands. It implements Eq. (B29) of PRB 64, 235118 (2001). The 
  !! contribution of the local pseudopotential is calculated here, that 
  !! of the nonlocal pseudopotential in \(\texttt{dvqpsi_us_only}\).
  !
  !
  USE kinds, only : DP
  USE funct,     ONLY : dft_is_nonlocc
  USE xc_lib,    ONLY : xclib_dft_is
  USE ions_base, ONLY : nat, ityp
  USE cell_base, ONLY : tpiba
  USE fft_base,  ONLY : dfftp, dffts
  USE fft_interfaces, ONLY: fwfft, invfft
  USE gvect,     ONLY : eigts1, eigts2, eigts3, mill, g, &
                        ngm
  USE gvecs,     ONLY : ngms, doublegrid
  USE lsda_mod,  ONLY : nspin, lsda, isk
  USE scf,       ONLY : rho, rho_core
  USE noncollin_module, ONLY : nspin_gga, npol
  use uspp_param,ONLY : upf
  USE wvfct,     ONLY : nbnd, npwx
  USE wavefunctions,  ONLY: evc
  USE wavefunctions_gpum, ONLY: evc_d
  USE nlcc_ph,    ONLY : drc
  USE uspp,       ONLY : nlcc_any
  USE eqv,        ONLY : dvpsi, dmuxc, vlocq
  USE qpoint,     ONLY : xq, eigqts, ikqs, ikks
  USE klist,      ONLY : ngk, igk_k
  USE gc_lr,      ONLY: grho, dvxc_rr,  dvxc_sr,  dvxc_ss, dvxc_s

  USE Coul_cut_2D, ONLY: do_cutoff_2D  
  USE Coul_cut_2D_ph, ONLY : cutoff_localq
  USE qpoint,     ONLY : nksq
  USE becmod,     ONLY : bec_type
  ! 
  IMPLICIT NONE
  !
  !   The dummy variables
  !
  INTEGER, INTENT(in) :: ik
  !! input: the k point
  COMPLEX(DP) :: uact(3*nat)
  !! input: the pattern of displacements
  LOGICAL :: addnlcc
  TYPE(bec_type) :: becp1(nksq), alphap(3,nksq)
  !
  ! ... local variables
  !
  INTEGER ::  na  
  !! counter on atoms
  INTEGER :: mu
  !! counter on modes
  INTEGER :: npw
  !! Number of pw
  INTEGER :: ikk
  !! the point k
  INTEGER :: npwq
  !! Number of q
  INTEGER :: ikq
  !! k-q index
  INTEGER :: iks
  !!
  INTEGER :: ig
  !! counter on G vectors
  INTEGER :: nt
  !! the type of atom
  INTEGER :: ibnd
  !! counter on bands
  INTEGER :: ir 
  !! counter on real mesh
  INTEGER :: is
  !! 
  INTEGER :: ip, nnr, nnp, itmp, itmpp
  !!
  complex(DP) :: gtau, gu, fact, u1, u2, u3, gu0
  complex(DP) , allocatable :: aux (:,:)
  complex(DP) , allocatable :: aux1 (:), aux2 (:)
  complex(DP) , pointer :: auxs (:)
  COMPLEX(DP), ALLOCATABLE :: drhoc(:,:)
  !
#if defined(__CUDA)
  INTEGER, POINTER, DEVICE :: nl_d(:), nlp_d(:)
  !
  nl_d  => dffts%nl_d
  nlp_d  => dfftp%nl_d
#else
  INTEGER, ALLOCATABLE :: nl_d(:)
  INTEGER, ALLOCATABLE :: nlp_d(:)
  !
  ALLOCATE( nl_d(dffts%ngm) )
  ALLOCATE( nlp_d(dfftp%ngm) )
  nl_d  = dffts%nl
  nlp_d  = dfftp%nl
#endif
  ! 
  call start_clock_gpu ('dvqpsi_us')
  allocate (aux1(dffts%nnr))
  allocate (aux2(dffts%nnr))
  !
  !    We start by computing the contribution of the local potential.
  !    The computation of the derivative of the local potential is done in
  !    reciprocal space while the product with the wavefunction is done in
  !    real space
  !
  ikk = ikks(ik)
  ikq = ikqs(ik)
  npw = ngk(ikk)
  npwq= ngk(ikq)
  nnr = dffts%nnr
  ! 
  !$acc data create(aux1(1:nnr),aux2(1:nnr)) copyout(dvpsi) copyin(vlocq,drc,dmuxc) present( igk_k ) deviceptr(evc_d, nl_d, nlp_d)
  !$acc kernels present(dvpsi,aux1)
  dvpsi(:,:) = (0.d0, 0.d0)
  aux1(:) = (0.d0, 0.d0)
  !$acc end kernels
  do na = 1, nat
     fact = tpiba * (0.d0, -1.d0) * eigqts (na)
     mu = 3 * (na - 1)
     if (abs (uact (mu + 1) ) + abs (uact (mu + 2) ) + abs (uact (mu + &
          3) ) .gt.1.0d-12) then
        nt = ityp (na)
        u1 = uact (mu + 1)
        u2 = uact (mu + 2)
        u3 = uact (mu + 3)
        gu0 = xq (1) * u1 + xq (2) * u2 + xq (3) * u3
        !$acc parallel loop present(eigts1, eigts2, eigts3, mill, g, aux1) 
        do ig = 1, ngms
           gtau = eigts1 (mill(1,ig), na) * eigts2 (mill(2,ig), na) * &
                  eigts3 (mill(3,ig), na)
           gu = gu0 + g (1, ig) * u1 + g (2, ig) * u2 + g (3, ig) * u3
           itmp = nl_d (ig)
           aux1 (itmp ) = aux1 ( itmp ) + vlocq (ig, nt) * gu * &
                fact * gtau
        enddo
        IF (do_cutoff_2D) then  
           !$acc update host(aux1)
           call cutoff_localq( aux1, fact, u1, u2, u3, gu0, nt, na)
           !$acc update device(aux1) 
        ENDIF
        !
     endif
  enddo
  !
  ! add NLCC when present
  !
  if (nlcc_any.and.addnlcc) then
     !CALL errore ('dvqpsi_us', 'openacc fpr ncll_any to be checked', 1)
     allocate (drhoc( dfftp%nnr,nspin))
     allocate (aux( dfftp%nnr,nspin))
     nnp=dfftp%nnr
     !$acc enter data create(drhoc(1:nnp,1:nspin),aux(1:nnp,1:nspin)) 
     !$acc kernels present(drhoc,aux) 
     drhoc(:,:) = (0.d0, 0.d0)
     aux(:,:) = (0.0_dp, 0.0_dp)
     !$acc end kernels 
     do na = 1,nat
        fact = tpiba*(0.d0,-1.d0)*eigqts(na)
        mu = 3*(na-1)
        if (abs(uact(mu+1))+abs(uact(mu+2))  &
                        +abs(uact(mu+3)).gt.1.0d-12) then
           nt=ityp(na)
           u1 = uact(mu+1)
           u2 = uact(mu+2)
           u3 = uact(mu+3)
           gu0 = xq(1)*u1 +xq(2)*u2+xq(3)*u3
           if (upf(nt)%nlcc) then
              !$acc parallel loop present(eigts1, eigts2, eigts3, g, mill,drhoc) 
              do ig = 1,ngm
                 gtau = eigts1(mill(1,ig),na)*   &
                        eigts2(mill(2,ig),na)*   &
                        eigts3(mill(3,ig),na)
                 gu = gu0+g(1,ig)*u1+g(2,ig)*u2+g(3,ig)*u3
                 itmp = nlp_d(ig)
                 drhoc(itmp,1)=drhoc(itmp,1)+drc(ig,nt)*gu*fact*gtau
              enddo
           endif
        endif
     enddo
     !$acc host_data use_device(drhoc)
     CALL invfft ('Rho', drhoc(:,1), dfftp)
     !$acc end host_data
     if (.not.lsda) then
        !$acc parallel loop present(aux,drhoc) 
        do ir=1,nnp
           aux(ir,1) = drhoc(ir,1) * dmuxc(ir,1,1)
        end do
     else
        is=isk(ikk)
        !$acc parallel loop present(drhoc,aux) copyin(is)
        do ir=1,nnp
           drhoc(ir,1) = 0.5d0 * drhoc(ir,1)
           drhoc(ir,2) = drhoc(ir,1)
           aux(ir,1) = drhoc(ir,1) * ( dmuxc(ir,is,1) + &
                                       dmuxc(ir,is,2) )
        enddo
     endif
     rho%of_r(:,1) = rho%of_r(:,1) + rho_core(:)
     !$acc exit data copyout(drhoc)

     IF ( xclib_dft_is('gradient') ) THEN
                    !$acc update host(aux) 
                    CALL dgradcorr (dfftp, rho%of_r, grho, dvxc_rr, &
                    dvxc_sr, dvxc_ss, dvxc_s, xq, drhoc, nspin, nspin_gga, g, aux)    
                    !$acc update device(aux)
     END IF   
     IF (dft_is_nonlocc()) THEN
             !$acc update host(aux)               ! to fix double update
             CALL dnonloccorr(rho%of_r, drhoc, xq, aux)
             !$acc update device(aux)
     END IF
     deallocate (drhoc)

     rho%of_r(:,1) = rho%of_r(:,1) - rho_core(:)

     !$acc host_data use_device(aux)
     CALL fwfft ('Rho', aux(:,1), dfftp)
     !$acc end host_data
! 
!  This is needed also when the smooth and the thick grids coincide to
!  cut the potential at the cut-off
!
     allocate (auxs(dffts%nnr))
     !$acc enter data create(auxs(1:nnr))
     !$acc kernels present(auxs)
     auxs(:) = (0.d0, 0.d0)
     !$acc end kernels
     !$acc parallel loop present(auxs,aux)
     do ig=1,ngms
        itmp = nl_d(ig)
        itmpp = nlp_d(ig) 
        auxs(itmp) = aux(itmpp,1)
     enddo
     !$acc kernels present(aux1,auxs)
     aux1(:) = aux1(:) + auxs(:)
     !$acc end kernels
     !$acc exit data delete(aux, auxs)
     deallocate (aux)
     deallocate (auxs)
  endif
  !
  ! Now we compute dV_loc/dtau in real space
  !
#if defined(__CUDA)
  evc_d = evc
#endif
  !$acc host_data use_device(aux1)
  CALL invfft ('Rho', aux1, dffts)
  !$acc end host_data
  do ibnd = 1, nbnd
     do ip=1,npol
        !$acc kernels present(aux2)
        aux2(:) = (0.d0, 0.d0)
        !$acc end kernels
        if (ip==1) then
           !$acc parallel loop present(aux2, igk_k) 
           do ig = 1, npw
              itmp = nl_d (igk_k (ig,ikk) )
#if defined(__CUDA)
              aux2 ( itmp ) = evc_d (ig, ibnd)
#else
              aux2 ( itmp ) = evc (ig, ibnd)
#endif
           enddo
        else
           !$acc parallel loop present(aux2, igk_k)
           do ig = 1, npw
              itmp = nl_d (igk_k (ig,ikk) )
#if defined(__CUDA)
              aux2 ( itmp ) = evc_d (ig+npwx, ibnd)
#else
              aux2 ( itmp ) = evc (ig+npwx, ibnd)
#endif
           enddo
        end if
        !
        !  This wavefunction is computed in real space
        !
        !$acc host_data use_device(aux2)
        CALL invfft ('Wave', aux2, dffts)
        !$acc end host_data
        !$acc parallel loop present(aux2, aux1)
        do ir = 1, nnr
           aux2 (ir) = aux2 (ir) * aux1 (ir)
        enddo
        !
        ! and finally dV_loc/dtau * psi is transformed in reciprocal space
        !
        !$acc host_data use_device(aux2)
        CALL fwfft ('Wave', aux2, dffts)
        !$acc end host_data
        if (ip==1) then
           !$acc parallel loop present( aux2, igk_k, dvpsi )
           do ig = 1, npwq
              itmp = nl_d (igk_k (ig,ikq) )
              dvpsi (ig, ibnd) = aux2 ( itmp )
           enddo
        else
           !$acc parallel loop present( aux2, igk_k, dvpsi )
           do ig = 1, npwq
              itmp = nl_d (igk_k (ig,ikq) )
              dvpsi (ig+npwx, ibnd) = aux2 ( itmp )
           enddo
        end if
     enddo
  enddo
  !$acc end data
  !
  deallocate (aux2)
  deallocate (aux1)
  !
  !   We add the contribution of the nonlocal potential in the US form
  !   First a term similar to the KB case.
  !   Then a term due to the change of the D coefficients.
  !
  call dvqpsi_us_only (ik, uact, becp1, alphap)

  call stop_clock_gpu ('dvqpsi_us')
#if !defined(__CUDA)
  DEALLOCATE(nl_d)
  DEALLOCATE(nlp_d)
#endif
  return
end subroutine dvqpsi_us
